import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import plotly.express as px
import plotly.offline as pyo
import plotly.graph_objs as go
def get_metrics(y_true, y_pred, model_name):
    """Return a summary row ``[model_name, accuracy, f1, precision, recall]``.

    The scores are computed with sklearn's binary-classification metrics,
    so ``y_true``/``y_pred`` are expected to be 0/1 label arrays.
    """
    scores = [
        accuracy_score(y_true, y_pred),
        f1_score(y_true, y_pred),
        precision_score(y_true, y_pred),
        recall_score(y_true, y_pred),
    ]
    return [model_name] + scores
# Load the mushroom dataset; every column (including the target "class")
# is a single-letter categorical code.
df = pd.read_csv('datasets/mushrooms.csv')
# Preview the first five rows.
df.head()
| class | cap-shape | cap-surface | cap-color | bruises | odor | gill-attachment | gill-spacing | gill-size | gill-color | ... | stalk-surface-below-ring | stalk-color-above-ring | stalk-color-below-ring | veil-type | veil-color | ring-number | ring-type | spore-print-color | population | habitat | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | p | x | s | n | t | p | f | c | n | k | ... | s | w | w | p | w | o | p | k | s | u |
| 1 | e | x | s | y | t | a | f | c | b | k | ... | s | w | w | p | w | o | p | n | n | g |
| 2 | e | b | s | w | t | l | f | c | b | n | ... | s | w | w | p | w | o | p | n | n | m |
| 3 | p | x | y | w | t | p | f | c | n | n | ... | s | w | w | p | w | o | p | k | s | u |
| 4 | e | x | s | g | f | n | f | w | b | k | ... | s | w | w | p | w | o | e | n | a | g |
5 rows × 23 columns
# Dataset dimensions as (rows, columns).
df.shape
(8124, 23)
# Per-column dtypes — every column is string-valued (dtype object) at this
# point, hence the encoding step below.
df.dtypes
class object cap-shape object cap-surface object cap-color object bruises object odor object gill-attachment object gill-spacing object gill-size object gill-color object stalk-shape object stalk-root object stalk-surface-above-ring object stalk-surface-below-ring object stalk-color-above-ring object stalk-color-below-ring object veil-type object veil-color object ring-number object ring-type object spore-print-color object population object habitat object dtype: object
Before we apply any ML models, we need to examine whether there are missing values in the data.
# True if any column contains at least one missing value; False means the
# dataset is complete.
df.isna().any().any()
False
**Conclusion**: There are no features with missing values in the dataset.
In order to perform the supervised learning algorithms discussed in the course Machine Learning, we need to encode the features as numbers. For this purpose, sklearn.preprocessing.LabelEncoder is going to be leveraged.
# Encode every categorical column into integer codes so the data can be fed
# to the sklearn estimators below. A single LabelEncoder is re-fit per
# column (each column has its own category-to-integer mapping).
encoded_df = df.copy()
encoder = LabelEncoder()
for column_name in df.columns:
    encoded_df[column_name] = encoder.fit_transform(encoded_df[column_name])
df = encoded_df
**Conclusion**: All of the data are numerical and can be represented within vector spaces.
For visualizations, the libraries plotly and seaborn are going to be used; the same can be achieved using matplotlib.
# One histogram per feature, split by edibility class, with a rug marginal
# showing the raw points.
for column_name in df.columns.to_list():
    histogram_fig = px.histogram(
        df, x=column_name, color="class", marginal="rug", hover_data=df.columns
    )
    histogram_fig.show()
# Pairwise scatter matrix over all encoded columns.
matrix_fig = px.scatter_matrix(df)
matrix_fig.show()
# Pearson correlation matrix of the integer-encoded columns. The veil-type
# row/column is all NaN (zero variance after encoding, so correlation is
# undefined for it).
df.corr()
| class | cap-shape | cap-surface | cap-color | bruises | odor | gill-attachment | gill-spacing | gill-size | gill-color | ... | stalk-surface-below-ring | stalk-color-above-ring | stalk-color-below-ring | veil-type | veil-color | ring-number | ring-type | spore-print-color | population | habitat | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| class | 1.000000 | 0.052951 | 0.178446 | -0.031384 | -0.501530 | -0.093552 | 0.129200 | -0.348387 | 0.540024 | -0.530566 | ... | -0.298801 | -0.154003 | -0.146730 | NaN | 0.145142 | -0.214366 | -0.411771 | 0.171961 | 0.298686 | 0.217179 |
| cap-shape | 0.052951 | 1.000000 | -0.050454 | -0.048203 | -0.035374 | -0.021935 | 0.078865 | 0.013196 | 0.054050 | -0.006039 | ... | -0.032591 | -0.031659 | -0.030390 | NaN | 0.072560 | -0.106534 | -0.025457 | -0.073416 | 0.063413 | -0.042221 |
| cap-surface | 0.178446 | -0.050454 | 1.000000 | -0.019402 | 0.070228 | 0.045233 | -0.034180 | -0.282306 | 0.208100 | -0.161017 | ... | 0.107965 | 0.066050 | 0.068885 | NaN | -0.016603 | -0.026147 | -0.106407 | 0.230364 | 0.021555 | 0.163887 |
| cap-color | -0.031384 | -0.048203 | -0.019402 | 1.000000 | -0.000764 | -0.387121 | 0.041436 | 0.144259 | -0.169464 | 0.084659 | ... | -0.047710 | 0.002364 | 0.008057 | NaN | 0.036130 | -0.005822 | 0.162513 | -0.293523 | -0.144770 | 0.033925 |
| bruises | -0.501530 | -0.035374 | 0.070228 | -0.000764 | 1.000000 | -0.061825 | 0.137359 | -0.299473 | -0.369596 | 0.527120 | ... | 0.458983 | 0.083538 | 0.092874 | NaN | 0.119770 | 0.056788 | 0.692973 | -0.285008 | 0.088137 | -0.075095 |
| odor | -0.093552 | -0.021935 | 0.045233 | -0.387121 | -0.061825 | 1.000000 | -0.059590 | 0.063936 | 0.310495 | -0.129213 | ... | 0.061820 | 0.174532 | 0.169407 | NaN | -0.057747 | 0.111905 | -0.281387 | 0.469055 | -0.043623 | -0.026610 |
| gill-attachment | 0.129200 | 0.078865 | -0.034180 | 0.041436 | 0.137359 | -0.059590 | 1.000000 | 0.071489 | 0.108984 | -0.128567 | ... | -0.116177 | 0.099299 | 0.097160 | NaN | 0.897518 | 0.093236 | -0.146689 | -0.029524 | 0.165575 | -0.030304 |
| gill-spacing | -0.348387 | 0.013196 | -0.282306 | 0.144259 | -0.299473 | 0.063936 | 0.071489 | 1.000000 | -0.108333 | 0.100193 | ... | -0.213775 | 0.274574 | 0.253505 | NaN | 0.073363 | 0.243014 | -0.195897 | 0.047323 | -0.529253 | -0.154680 |
| gill-size | 0.540024 | 0.054050 | 0.208100 | -0.169464 | -0.369596 | 0.310495 | 0.108984 | -0.108333 | 1.000000 | -0.516736 | ... | 0.010894 | 0.296548 | 0.278708 | NaN | 0.103809 | -0.171362 | -0.460872 | 0.622991 | 0.147682 | 0.161418 |
| gill-color | -0.530566 | -0.006039 | -0.161017 | 0.084659 | 0.527120 | -0.129213 | -0.128567 | 0.100193 | -0.516736 | 1.000000 | ... | 0.257224 | -0.058299 | -0.074781 | NaN | -0.097583 | 0.096054 | 0.629398 | -0.416135 | -0.034090 | -0.202972 |
| stalk-shape | -0.102019 | 0.063794 | -0.014123 | -0.456496 | 0.099364 | 0.459766 | 0.186485 | 0.080895 | 0.214576 | -0.175699 | ... | -0.034399 | 0.223439 | 0.235794 | NaN | 0.162604 | -0.293221 | -0.291444 | 0.258831 | 0.087383 | -0.269216 |
| stalk-root | -0.379361 | 0.030191 | -0.126245 | 0.321274 | 0.244188 | -0.205215 | 0.144063 | 0.350548 | -0.344345 | 0.315080 | ... | 0.087454 | 0.157140 | 0.159805 | NaN | 0.156213 | -0.247357 | 0.210155 | -0.536996 | -0.306747 | -0.007668 |
| stalk-surface-above-ring | -0.334593 | -0.030417 | 0.089090 | -0.060837 | 0.460824 | 0.118617 | -0.088916 | -0.212359 | 0.056310 | 0.224287 | ... | 0.437164 | 0.132708 | 0.142835 | NaN | -0.090591 | 0.107904 | 0.390091 | 0.100764 | 0.079604 | -0.058076 |
| stalk-surface-below-ring | -0.298801 | -0.032591 | 0.107965 | -0.047710 | 0.458983 | 0.061820 | -0.116177 | -0.213775 | 0.010894 | 0.257224 | ... | 1.000000 | 0.106933 | 0.110656 | NaN | -0.077284 | 0.040006 | 0.394644 | 0.130974 | 0.046797 | -0.039628 |
| stalk-color-above-ring | -0.154003 | -0.031659 | 0.066050 | 0.002364 | 0.083538 | 0.174532 | 0.099299 | 0.274574 | 0.296548 | -0.058299 | ... | 0.106933 | 1.000000 | 0.491510 | NaN | 0.067377 | 0.084917 | -0.048878 | 0.271533 | -0.240261 | 0.042561 |
| stalk-color-below-ring | -0.146730 | -0.030390 | 0.068885 | 0.008057 | 0.092874 | 0.169407 | 0.097160 | 0.253505 | 0.278708 | -0.074781 | ... | 0.110656 | 0.491510 | 1.000000 | NaN | 0.065567 | 0.087580 | -0.034284 | 0.254518 | -0.242792 | 0.041594 |
| veil-type | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| veil-color | 0.145142 | 0.072560 | -0.016603 | 0.036130 | 0.119770 | -0.057747 | 0.897518 | 0.073363 | 0.103809 | -0.097583 | ... | -0.077284 | 0.067377 | 0.065567 | NaN | 1.000000 | 0.036380 | -0.143673 | -0.003600 | 0.124924 | -0.040581 |
| ring-number | -0.214366 | -0.106534 | -0.026147 | -0.005822 | 0.056788 | 0.111905 | 0.093236 | 0.243014 | -0.171362 | 0.096054 | ... | 0.040006 | 0.084917 | 0.087580 | NaN | 0.036380 | 1.000000 | 0.058312 | 0.338417 | -0.242020 | 0.235835 |
| ring-type | -0.411771 | -0.025457 | -0.106407 | 0.162513 | 0.692973 | -0.281387 | -0.146689 | -0.195897 | -0.460872 | 0.629398 | ... | 0.394644 | -0.048878 | -0.034284 | NaN | -0.143673 | 0.058312 | 1.000000 | -0.487048 | 0.211763 | -0.212080 |
| spore-print-color | 0.171961 | -0.073416 | 0.230364 | -0.293523 | -0.285008 | 0.469055 | -0.029524 | 0.047323 | 0.622991 | -0.416135 | ... | 0.130974 | 0.271533 | 0.254518 | NaN | -0.003600 | 0.338417 | -0.487048 | 1.000000 | -0.126859 | 0.185954 |
| population | 0.298686 | 0.063413 | 0.021555 | -0.144770 | 0.088137 | -0.043623 | 0.165575 | -0.529253 | 0.147682 | -0.034090 | ... | 0.046797 | -0.240261 | -0.242792 | NaN | 0.124924 | -0.242020 | 0.211763 | -0.126859 | 1.000000 | -0.174529 |
| habitat | 0.217179 | -0.042221 | 0.163887 | 0.033925 | -0.075095 | -0.026610 | -0.030304 | -0.154680 | 0.161418 | -0.202972 | ... | -0.039628 | 0.042561 | 0.041594 | NaN | -0.040581 | 0.235835 | -0.212080 | 0.185954 | -0.174529 | 1.000000 |
23 rows × 23 columns
from sklearn.decomposition import PCA

# Project the encoded feature columns (target dropped) onto the first three
# principal components for visualization.
pca = PCA(n_components=3)
pca_reduced_df = pd.DataFrame(pca.fit_transform(df.drop(columns=["class"])))
# Re-attach the target as a fourth column so it can drive the point colors.
pca_reduced_df.insert(3, "class", df["class"].to_list(), True)
# 3-D scatter of the projection, colored by edibility class.
# NOTE(review): the DataFrame columns are the ints 0/1/2 while the labels
# dict keys are the strings "0"/"1"/"2" — confirm plotly actually applies
# these axis labels.
fig = px.scatter_3d(pca_reduced_df, x=0, y=1,z=2, color="class", labels={
    "0": "PC 1",
    "1": "PC 2",
    "2": "PC 3"
})
fig.show()
Before we apply any of the models, we need to divide our dataset into train and test subsets. We are going to leverage sklearn.model_selection.train_test_split, which performs a simple random split without replacement (note: this is not bootstrap sampling). We are going to use 20% of the dataset for testing and 80% for training the models.
# Split features/target: every column except "class" is a predictor.
X, y = df.drop(columns=["class"]), df["class"]
# 80/20 train/test split.
# FIX: pin random_state so the split — and therefore every metric reported
# below — is reproducible across runs, and stratify on y so both subsets
# keep the original edible/poisonous proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)
# Accumulates one [name, accuracy, f1, precision, recall] row per model.
metrics = []
from sklearn.naive_bayes import CategoricalNB

# Naive Bayes for categorical features — a natural fit for the
# label-encoded categorical columns.
nb_clf = CategoricalNB()
nb_clf.fit(X_train, y_train)
CategoricalNB()
# Evaluate Naive Bayes on the held-out test set.
y_pred_nb = nb_clf.predict(X_test)
print(classification_report(y_test, y_pred_nb))
precision recall f1-score support
0 0.91 1.00 0.95 847
1 0.99 0.90 0.94 778
accuracy 0.95 1625
macro avg 0.95 0.95 0.95 1625
weighted avg 0.95 0.95 0.95 1625
# Confusion matrix — rows are true classes, columns are predicted classes.
print(confusion_matrix(y_test, y_pred_nb))
[[843 4] [ 79 699]]
# Record the Naive Bayes scores for the final comparison table.
metrics.append(get_metrics(y_test, y_pred_nb, "Naïve Bayes"))
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Linear Discriminant Analysis baseline, fit on the same training split.
lda_clf = LinearDiscriminantAnalysis()
lda_clf.fit(X_train, y_train)
LinearDiscriminantAnalysis()
# Evaluate LDA on the held-out test set.
y_pred_lda = lda_clf.predict(X_test)
print(classification_report(y_test, y_pred_lda))
precision recall f1-score support
0 0.93 0.94 0.94 847
1 0.94 0.92 0.93 778
accuracy 0.93 1625
macro avg 0.93 0.93 0.93 1625
weighted avg 0.93 0.93 0.93 1625
# Confusion matrix for LDA — rows true, columns predicted.
print(confusion_matrix(y_test, y_pred_lda))
[[800 47] [ 61 717]]
# Record the LDA scores for the final comparison table.
metrics.append(get_metrics(y_test, y_pred_lda, "LDA"))
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# Quadratic Discriminant Analysis. The "Variables are collinear" warning
# emitted during fit is expected for this encoded dataset (it contains
# strongly correlated columns, and veil-type appears to be constant).
qda_clf = QuadraticDiscriminantAnalysis()
qda_clf.fit(X_train, y_train)
d:\projects\pycharmprojects\ml_2021\venv2\lib\site-packages\sklearn\discriminant_analysis.py:808: UserWarning: Variables are collinear
QuadraticDiscriminantAnalysis()
# FIX: the original called lda_clf.predict here (copy-paste error), which is
# why the reported "QDA" metrics were byte-for-byte identical to LDA's in
# the summary table. Predict with the fitted QDA model instead.
y_pred_qda = qda_clf.predict(X_test)
print(classification_report(y_test, y_pred_qda))
precision recall f1-score support
0 0.93 0.94 0.94 847
1 0.94 0.92 0.93 778
accuracy 0.93 1625
macro avg 0.93 0.93 0.93 1625
weighted avg 0.93 0.93 0.93 1625
# Confusion matrix for QDA — rows true, columns predicted.
print(confusion_matrix(y_test, y_pred_qda))
[[800 47] [ 61 717]]
# Record the QDA scores for the final comparison table.
metrics.append(get_metrics(y_test, y_pred_qda, "QDA"))
# Collect the per-model score rows into one comparison table and highlight
# the best value of each metric column in red.
summary_columns = ["Model", "Accuracy", "f1", "Precision", "Recall"]
models_summary = pd.DataFrame(metrics, columns=summary_columns)
models_summary = models_summary.style.highlight_max(color='red', axis=0)
models_summary
| Model | Accuracy | f1 | Precision | Recall | |
|---|---|---|---|---|---|
| 0 | Naïve Bayes | 0.948923 | 0.943957 | 0.994310 | 0.898458 |
| 1 | LDA | 0.933538 | 0.929961 | 0.938482 | 0.921594 |
| 2 | QDA | 0.933538 | 0.929961 | 0.938482 | 0.921594 |
**Conclusion**: The Naïve Bayes classifier provides the highest accuracy, f1-score and precision, but on the other hand LDA and QDA both have the same recall, which is higher than the Naïve Bayes's. The performance achieved by the three models is nearly the same, and the Naïve Bayes dominates. (Reviewer note: the LDA and QDA rows are identical, which suggests the QDA predictions were not actually generated from the QDA model.)